In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]:
df=pd.read_csv(r"D:\Downloads\dataset.csv")
In [3]:
df.head()
Out[3]:
VIN (1-10) County City State Postal Code Model Year Make Model Electric Vehicle Type Clean Alternative Fuel Vehicle (CAFV) Eligibility Electric Range Base MSRP Legislative District DOL Vehicle ID Vehicle Location Electric Utility 2020 Census Tract
0 JTMEB3FV6N Monroe Key West FL 33040 2022 TOYOTA RAV4 PRIME Plug-in Hybrid Electric Vehicle (PHEV) Clean Alternative Fuel Vehicle Eligible 42 0 NaN 198968248 POINT (-81.80023 24.5545) NaN 12087972100
1 1G1RD6E45D Clark Laughlin NV 89029 2013 CHEVROLET VOLT Plug-in Hybrid Electric Vehicle (PHEV) Clean Alternative Fuel Vehicle Eligible 38 0 NaN 5204412 POINT (-114.57245 35.16815) NaN 32003005702
2 JN1AZ0CP8B Yakima Yakima WA 98901 2011 NISSAN LEAF Battery Electric Vehicle (BEV) Clean Alternative Fuel Vehicle Eligible 73 0 15.0 218972519 POINT (-120.50721 46.60448) PACIFICORP 53077001602
3 1G1FW6S08H Skagit Concrete WA 98237 2017 CHEVROLET BOLT EV Battery Electric Vehicle (BEV) Clean Alternative Fuel Vehicle Eligible 238 0 39.0 186750406 POINT (-121.7515 48.53892) PUGET SOUND ENERGY INC 53057951101
4 3FA6P0SU1K Snohomish Everett WA 98201 2019 FORD FUSION Plug-in Hybrid Electric Vehicle (PHEV) Not eligible due to low battery range 26 0 38.0 2006714 POINT (-122.20596 47.97659) PUGET SOUND ENERGY INC 53061041500
In [4]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112634 entries, 0 to 112633
Data columns (total 17 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   VIN (1-10)                                         112634 non-null  object 
 1   County                                             112634 non-null  object 
 2   City                                               112634 non-null  object 
 3   State                                              112634 non-null  object 
 4   Postal Code                                        112634 non-null  int64  
 5   Model Year                                         112634 non-null  int64  
 6   Make                                               112634 non-null  object 
 7   Model                                              112614 non-null  object 
 8   Electric Vehicle Type                              112634 non-null  object 
 9   Clean Alternative Fuel Vehicle (CAFV) Eligibility  112634 non-null  object 
 10  Electric Range                                     112634 non-null  int64  
 11  Base MSRP                                          112634 non-null  int64  
 12  Legislative District                               112348 non-null  float64
 13  DOL Vehicle ID                                     112634 non-null  int64  
 14  Vehicle Location                                   112610 non-null  object 
 15  Electric Utility                                   112191 non-null  object 
 16  2020 Census Tract                                  112634 non-null  int64  
dtypes: float64(1), int64(6), object(10)
memory usage: 14.6+ MB
In [5]:
df.isna().sum()
Out[5]:
VIN (1-10)                                             0
County                                                 0
City                                                   0
State                                                  0
Postal Code                                            0
Model Year                                             0
Make                                                   0
Model                                                 20
Electric Vehicle Type                                  0
Clean Alternative Fuel Vehicle (CAFV) Eligibility      0
Electric Range                                         0
Base MSRP                                              0
Legislative District                                 286
DOL Vehicle ID                                         0
Vehicle Location                                      24
Electric Utility                                     443
2020 Census Tract                                      0
dtype: int64
In [6]:
df.describe()
Out[6]:
Postal Code Model Year Electric Range Base MSRP Legislative District DOL Vehicle ID 2020 Census Tract
count 112634.000000 112634.000000 112634.000000 112634.000000 112348.000000 1.126340e+05 1.126340e+05
mean 98156.226850 2019.003365 87.812987 1793.439681 29.805604 1.994567e+08 5.296650e+10
std 2648.733064 2.892364 102.334216 10783.753486 14.700545 9.398427e+07 1.699104e+09
min 1730.000000 1997.000000 0.000000 0.000000 1.000000 4.777000e+03 1.101001e+09
25% 98052.000000 2017.000000 0.000000 0.000000 18.000000 1.484142e+08 5.303301e+10
50% 98119.000000 2020.000000 32.000000 0.000000 34.000000 1.923896e+08 5.303303e+10
75% 98370.000000 2022.000000 208.000000 0.000000 43.000000 2.191899e+08 5.305307e+10
max 99701.000000 2023.000000 337.000000 845000.000000 49.000000 4.792548e+08 5.603300e+10
In [7]:
df.shape
Out[7]:
(112634, 17)

EDA_Exploratary Data Analysis¶

In [9]:
df.duplicated().sum()
Out[9]:
0
In [10]:
df["Model"]=df["Model"].fillna(df["Model"].mode()[0])
In [11]:
df["Legislative District"]=df["Legislative District"].fillna(df["Legislative District"].mean())
In [12]:
df["Vehicle Location"]=df["Vehicle Location"].fillna(df["Vehicle Location"].mode()[0])
In [13]:
df["Electric Utility"]=df["Electric Utility"].fillna(df["Electric Utility"].mode()[0])
In [14]:
df.isna().sum()
Out[14]:
VIN (1-10)                                           0
County                                               0
City                                                 0
State                                                0
Postal Code                                          0
Model Year                                           0
Make                                                 0
Model                                                0
Electric Vehicle Type                                0
Clean Alternative Fuel Vehicle (CAFV) Eligibility    0
Electric Range                                       0
Base MSRP                                            0
Legislative District                                 0
DOL Vehicle ID                                       0
Vehicle Location                                     0
Electric Utility                                     0
2020 Census Tract                                    0
dtype: int64
In [15]:
df.to_csv("Analysis on Electric Vehicles")
In [16]:
df.shape
Out[16]:
(112634, 17)
In [17]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112634 entries, 0 to 112633
Data columns (total 17 columns):
 #   Column                                             Non-Null Count   Dtype  
---  ------                                             --------------   -----  
 0   VIN (1-10)                                         112634 non-null  object 
 1   County                                             112634 non-null  object 
 2   City                                               112634 non-null  object 
 3   State                                              112634 non-null  object 
 4   Postal Code                                        112634 non-null  int64  
 5   Model Year                                         112634 non-null  int64  
 6   Make                                               112634 non-null  object 
 7   Model                                              112634 non-null  object 
 8   Electric Vehicle Type                              112634 non-null  object 
 9   Clean Alternative Fuel Vehicle (CAFV) Eligibility  112634 non-null  object 
 10  Electric Range                                     112634 non-null  int64  
 11  Base MSRP                                          112634 non-null  int64  
 12  Legislative District                               112634 non-null  float64
 13  DOL Vehicle ID                                     112634 non-null  int64  
 14  Vehicle Location                                   112634 non-null  object 
 15  Electric Utility                                   112634 non-null  object 
 16  2020 Census Tract                                  112634 non-null  int64  
dtypes: float64(1), int64(6), object(10)
memory usage: 14.6+ MB
In [18]:
df.columns
Out[18]:
Index(['VIN (1-10)', 'County', 'City', 'State', 'Postal Code', 'Model Year',
       'Make', 'Model', 'Electric Vehicle Type',
       'Clean Alternative Fuel Vehicle (CAFV) Eligibility', 'Electric Range',
       'Base MSRP', 'Legislative District', 'DOL Vehicle ID',
       'Vehicle Location', 'Electric Utility', '2020 Census Tract'],
      dtype='object')

Univariate Analysis¶

Import required library - plotly.express¶

In [19]:
import plotly.express as px
In [20]:
fig = px.box(df, y='Electric Range', 
             title="Box Plot of Electric Range",
             labels={'Electric Range': 'Electric Range'},
             color_discrete_sequence=["#FF5733"])  # Custom color (e.g., orange)

# Customize layout
fig.update_layout(yaxis_title="Electric Range", width=800, height=600)

# Show plot
fig.show()
In [21]:
# Histogram for 'Base MSRP'
fig2 = px.histogram(df, x='Base MSRP', 
                    title="Histogram of Base MSRP",
                    labels={'Base MSRP': 'Base MSRP'},
                    nbins=30,
                    color_discrete_sequence=['#EF553B'])  # Another custom color

# Customize layout
fig2.update_layout(xaxis_title="Base MSRP", yaxis_title="Count", width=800, height=600)

# Show plot for 'Base MSRP'
fig2.show()
In [ ]:
 

Histograms for numerical features¶

In [22]:
# Replace 'Electric Range' with the actual column name if it contains spaces.
sns.histplot(df['Electric Range'], 
bins=30, kde=True).set_title('Histogram of Electric Range')
plt.show()
No description has been provided for this image
In [23]:
# Use a darker color palette
sns.countplot(x='Model Year', data=df, palette='dark')
plt.title('Count of Vehicles by Model Year')
plt.xticks(rotation=45)
plt.xlabel('Model Year')
plt.ylabel('Count')
plt.show()
No description has been provided for this image
In [24]:
# Use a dark color palette
sns.countplot(y='Make', data=df, order=df['Make'].value_counts().index, palette='dark')
plt.title('Count of Vehicles by Make')
plt.ylabel('Make')
plt.xlabel('Count')
plt.show()
No description has been provided for this image
In [ ]:
 

Frequency distribution for categorical features¶

Bivariate Analysis¶

Scatter plot using plotly.express¶

Scatter Plot Numerical vs Numerical A( Electric Range vs Make)¶

Task- This is an open ended problem.apply exploratory data analysis (Univariate and Bivariate) on the dataset available above.¶

In [25]:
px.scatter(df,x = "Make",y ="Electric Range")

Box plot using plotly.Expess¶

In [26]:
px.box(df, x = "Electric Vehicle Type", y = "Electric Range")

pie chart plot using plotly.Expess¶

In [28]:
px.pie(df,names = "Make", values = "2020 Census Tract")
In [29]:
df["State"].unique()
Out[29]:
array(['FL', 'NV', 'WA', 'IL', 'NY', 'VA', 'OK', 'KS', 'CA', 'NE', 'MD',
       'CO', 'DC', 'TN', 'SC', 'CT', 'OR', 'TX', 'SD', 'HI', 'GA', 'MS',
       'AR', 'NC', 'MO', 'UT', 'PA', 'DE', 'OH', 'WY', 'AL', 'ID', 'AZ',
       'AK', 'LA', 'NM', 'WI', 'KY', 'NJ', 'MN', 'MA', 'ME', 'RI', 'NH',
       'ND'], dtype=object)
In [30]:
grouped_df = df.groupby("State").agg({"Electric Range":"mean"})
In [31]:
df.shape
Out[31]:
(112634, 17)
In [32]:
# Use a dark color palette
sns.set_palette("dark")

top_cities = df['City'].value_counts().nlargest(10).index  # Top 10 cities
sns.countplot(y='City', data=df[df['City'].isin(top_cities)])
plt.title('Top 10 Cities for Electric Vehicles')
plt.xlabel('Count')
plt.ylabel('City')
plt.show()
No description has been provided for this image

Heatmap of Correlation (for numeric variables)¶

In [33]:
import numpy as np
# Calculate the correlation matrix
correlation_matrix = df.corr()

# Create heatmap for correlation matrix
fig = px.imshow(correlation_matrix, title="Correlation Heatmap", text_auto=True)
fig.show()
C:\Users\hp\AppData\Local\Temp\ipykernel_17068\2181185178.py:3: FutureWarning:

The default value of numeric_only in DataFrame.corr is deprecated. In a future version, it will default to False. Select only valid columns or specify the value of numeric_only to silence this warning.

In [34]:
# Pie chart for Electric Vehicle Type
fig = px.pie(df, names='Electric Vehicle Type', title="Distribution of Electric Vehicle Types")
fig.show()
In [35]:
import pandas as pd
import plotly.express as px

# Load your dataset
df = pd.read_csv(r"D:\Downloads\dataset.csv")

# Print the column names to verify
print(df.columns)

# Count the number of vehicles for each Postal Code and Model Year
state_nyc = df.groupby(['Postal Code', 'Model Year']).size().reset_index(name='Number_of_Vehicles')

# Create a choropleth mapbox
fig = px.choropleth_mapbox(
    state_nyc,
    geojson='https://raw.githubusercontent.com/python-visualization/folium/master/examples/data/us-states.json',
    locations='Postal Code',
    color='Number_of_Vehicles',
    featureidkey="properties.ZCTA5CE10",  # This key must match the geojson structure
    mapbox_style="carto-positron",
    zoom=5,
    center={"lat": 47.7511, "lon": -120.7401},
    title="Number of EV Vehicles based on location in Washington Over Time",
    animation_frame='Model Year',
    hover_data=['Number_of_Vehicles']
)

# Update layout for aesthetics
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()  # Show the animated map
Index(['VIN (1-10)', 'County', 'City', 'State', 'Postal Code', 'Model Year',
       'Make', 'Model', 'Electric Vehicle Type',
       'Clean Alternative Fuel Vehicle (CAFV) Eligibility', 'Electric Range',
       'Base MSRP', 'Legislative District', 'DOL Vehicle ID',
       'Vehicle Location', 'Electric Utility', '2020 Census Tract'],
      dtype='object')
In [36]:
pip install bar_chart_race
Requirement already satisfied: bar_chart_race in c:\users\hp\anaconda3\lib\site-packages (0.1.0)
Requirement already satisfied: pandas>=0.24 in c:\users\hp\anaconda3\lib\site-packages (from bar_chart_race) (1.5.3)
Requirement already satisfied: matplotlib>=3.1 in c:\users\hp\anaconda3\lib\site-packages (from bar_chart_race) (3.8.0)
Requirement already satisfied: contourpy>=1.0.1 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (1.2.0)
Requirement already satisfied: cycler>=0.10 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (4.25.0)
Requirement already satisfied: kiwisolver>=1.0.1 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (1.4.4)
Requirement already satisfied: numpy<2,>=1.21 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (1.24.4)
Requirement already satisfied: packaging>=20.0 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (23.1)
Requirement already satisfied: pillow>=6.2.0 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (10.2.0)
Requirement already satisfied: pyparsing>=2.3.1 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (3.0.9)
Requirement already satisfied: python-dateutil>=2.7 in c:\users\hp\anaconda3\lib\site-packages (from matplotlib>=3.1->bar_chart_race) (2.8.2)
Requirement already satisfied: pytz>=2020.1 in c:\users\hp\anaconda3\lib\site-packages (from pandas>=0.24->bar_chart_race) (2023.3.post1)
Requirement already satisfied: six>=1.5 in c:\users\hp\anaconda3\lib\site-packages (from python-dateutil>=2.7->matplotlib>=3.1->bar_chart_race) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
In [37]:
import pandas as pd
import plotly.express as px
from bar_chart_race import bar_chart_race

# Load your dataset
df = pd.read_csv(r"D:\Downloads\dataset.csv")

# Group the data by 'Make' and 'Model Year' and count the number of vehicles
d = df.groupby(['Make', 'Model Year']).size().reset_index(name='Number_of_Vehicles')

# Create the animated racing bar plot
fig = px.bar(d,
              x='Number_of_Vehicles',  # Place the count of EV vehicles on the x-axis
              y='Make',                # Place Make on the y-axis
              color='Make',            # Color each make differently
              animation_frame='Model Year',  # Create animation by year
              orientation='h',         # Horizontal bar chart
              title='EV Makes and Their Count Over the Years',
              labels={'Number_of_Vehicles': 'Number of EV Vehicles'},
              range_x=[0, 3000])      # Set x-axis range

# Update traces for aesthetics
fig.update_traces(texttemplate='%{x}', textposition='outside')  # Display the actual x-axis values
fig.update_layout(yaxis=dict(showgrid=True, gridcolor='LightGray'),  # Show grid for better visibility
                  yaxis_title='EV Makes',                           # Title for y-axis
                  xaxis_title='Number of EV Vehicles',            # Title for x-axis
                  title_x=0.5,                                    # Center title
                  title_font=dict(size=20),                       # Increase title font size
                  width=800, height=600)                          # Set fixed width and height

# Show the plot
fig.show()